implement importChanges optimisaton (not used yet)

author Joey Hess <joeyh@joeyh.name>

Wed, 31 May 2023 19:45:23 +0000 (15:45 -0400)

committer Joey Hess <joeyh@joeyh.name>

Wed, 31 May 2023 20:01:34 +0000 (16:01 -0400)
author Joey Hess <joeyh@joeyh.name>
Wed, 31 May 2023 19:45:23 +0000 (15:45 -0400)
committer Joey Hess <joeyh@joeyh.name>
Wed, 31 May 2023 20:01:34 +0000 (16:01 -0400)
diff --git a/Annex/Import.hs b/Annex/Import.hs

index fccbc5dda4a91b64c1a0b71341ab60830408d662..e13b604127b071cd732dcc69cc2a78e962ae14b2 100644 (file)
--- a/Annex/Import.hs
+++ b/Annex/Import.hs
@@ -14,6 +14,8 @@ module Annex.Import (
         buildImportTrees,
         recordImportTree,
         canImportKeys,
+       ImportResult(..),
+       importChanges,
         importKeys,
         makeImportMatcher,
         getImportableContents,
@@ -27,6 +29,7 @@ import Git.Tree
  import Git.Sha
  import Git.FilePath
  import Git.History
+import qualified Git.DiffTree
  import qualified Git.Ref
  import qualified Git.Branch
  import qualified Annex
@@ -47,6 +50,7 @@ import Messages.Progress
  import Utility.DataUnits
  import Utility.Metered
  import Utility.Hash (sha1s)
+import Logs.Import
  import Logs.Export
  import Logs.Location
  import Logs.PreferredContent
@@ -300,22 +304,30 @@ convertImportTree msubdir ls = treeItemsToTree <$> mapM mktreeitem ls
   -}
  buildContentIdentifierTree
         :: ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
-       -> Annex (History Sha)
-buildContentIdentifierTree = 
-       buildImportTreesGeneric convertContentIdentifierTree emptyTree Nothing
+       -> Annex (History Sha, M.Map Sha (ContentIdentifier, ByteSize))
+buildContentIdentifierTree importable = do
+       mv <- liftIO $ newTVarIO M.empty
+       r <- buildImportTreesGeneric (convertContentIdentifierTree mv) emptyTree Nothing importable
+       m <- liftIO $ atomically $ readTVar mv
+       return (r, m)
  
  {- For speed, and to avoid bloating the repository, the ContentIdentifiers
   - are not actually checked into git, instead a sha1 hash is calculated
   - internally.
   -}
  convertContentIdentifierTree
-       :: Maybe TopFilePath
+       :: TVar (M.Map Sha (ContentIdentifier, ByteSize))
+       -> Maybe TopFilePath
         -> [(ImportLocation, (ContentIdentifier, ByteSize))]
         -> Annex Tree
-convertContentIdentifierTree _ ls = pure $ treeItemsToTree $ map mktreeitem ls
+convertContentIdentifierTree mv _ ls = do
+       let (tis, ml) = unzip (map mktreeitem ls)
+       liftIO $ atomically $ modifyTVar' mv $
+               M.union (M.fromList ml)
+       return (treeItemsToTree tis)
    where
-       mktreeitem (loc, ((ContentIdentifier cid), _sz)) = 
-               TreeItem p mode sha1
+       mktreeitem (loc, v@((ContentIdentifier cid), _sz)) =
+               (TreeItem p mode sha1, (sha1, v))
           where
                 p = asTopFilePath (fromImportLocation loc)
                 mode = fromTreeItemType TreeFile
@@ -414,6 +426,90 @@ canImportKeys remote importcontent =
    where
         ia = Remote.importActions remote
  
+data Diffed t
+       = DiffChanged t
+       | DiffRemoved
+
+{- Diffs between the current and previous ContentIdentifier trees, and 
+ - runs importKeys on only the changed files.
+ -
+ - This will download the same content as if importKeys were run on all
+ - files, but this speeds it up significantly when there are a lot of files
+ - and only a few have changed. importKeys has to look up each
+ - ContentIdentifier to see if a Key is known for it. This avoids doing
+ - that lookup on files that have not changed.
+ -
+ - Diffing is not currently implemented when there is a History.
+ -}
+importChanges
+       :: Remote
+       -> ImportTreeConfig
+       -> Bool
+       -> Bool
+       -> ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
+       -> Annex (ImportResult (Either
+               (ImportableContentsChunkable Annex (Either Sha Key))
+               (ImportableContentsChunkable Annex (Diffed (Either Sha Key)))))
+importChanges remote importtreeconfig importcontent thirdpartypopulated importablecontents = do
+       ((History currcidtree currhistory), cidtreemap) <- buildContentIdentifierTree importablecontents
+       -- diffimport below does not handle history, so when there is
+       -- history, do a full import.
+       if not (S.null currhistory)
+               then fullimport currcidtree
+               else do
+                       getContentIdentifierTree (Remote.uuid remote) >>= \case
+                               Nothing -> fullimport currcidtree
+                               Just prevcidtree -> diffimport cidtreemap prevcidtree currcidtree
+  where
+       remember = recordContentIdentifierTree (Remote.uuid remote)
+
+       fullimport currcidtree = 
+               importKeys remote importtreeconfig importcontent thirdpartypopulated importablecontents >>= \case
+                       ImportUnfinished -> return ImportUnfinished
+                       ImportFinished r -> do
+                               remember currcidtree
+                               return $ ImportFinished $ Left r
+       
+       diffimport cidtreemap prevcidtree currcidtree = do
+               (diff, cleanup) <- inRepo $ Git.DiffTree.diffTreeRecursive currcidtree prevcidtree
+               let (removed, changed) = partition (\ti -> Git.DiffTree.dstsha ti `elem` nullShas) diff
+               let mkloc = mkImportLocation . getTopFilePath . Git.DiffTree.file
+               let mkicchanged ti = do
+                       v <- M.lookup (Git.DiffTree.dstsha ti) cidtreemap
+                       return (mkloc ti, v)
+               let ic = ImportableContentsComplete $ ImportableContents
+                               { importableContents = mapMaybe mkicchanged changed
+                               , importableHistory = []
+                               }
+               importKeys remote importtreeconfig importcontent thirdpartypopulated ic >>= \case
+                       ImportUnfinished -> do
+                               void $ liftIO cleanup
+                               return ImportUnfinished
+                       ImportFinished (ImportableContentsComplete ic') -> liftIO cleanup >>= \case
+                               False -> return ImportUnfinished
+                               True -> do
+                                       remember currcidtree
+                                       let diffchanged = map
+                                               (\(loc, v) -> (loc, DiffChanged v))
+                                               (importableContents ic')
+                                       let diffremoved = map
+                                               (\ti -> (mkloc ti, DiffRemoved))
+                                               removed
+                                       let ic'' = ImportableContentsComplete $ ImportableContents
+                                               { importableContents = diffremoved ++ diffchanged
+                                               , importableHistory = []
+                                               }
+                                       return $ ImportFinished $ Right ic''
+                       -- importKeys is not passed ImportableContentsChunked
+                       -- above, so it cannot return it
+                       ImportFinished (ImportableContentsChunked {}) -> error "internal"
+
+-- Result of an import. ImportUnfinished indicates that some file failed to
+-- be imported. Running again should resume where it left off.
+data ImportResult t
+       = ImportFinished t
+       | ImportUnfinished
+
  {- Downloads all new ContentIdentifiers, or when importcontent is False,
   - generates Keys without downloading.
   -
@@ -423,9 +519,6 @@ canImportKeys remote importcontent =
   -
   - Supports concurrency when enabled.
   -
- - If it fails on any file, the whole thing fails with Nothing, 
- - but it will resume where it left off.
- -
   - Note that, when a ContentIdentifier has been imported before,
   - generates the same thing that was imported before, so annex.largefiles
   - is not reapplied.
@@ -436,10 +529,8 @@ importKeys
         -> Bool
         -> Bool
         -> ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
-       -> Annex (Maybe (ImportableContentsChunkable Annex (Either Sha Key)))
+       -> Annex (ImportResult (ImportableContentsChunkable Annex (Either Sha Key)))
  importKeys remote importtreeconfig importcontent thirdpartypopulated importablecontents = do
-       _ts <- buildContentIdentifierTree importablecontents
-       -- TODO use above
         unless (canImportKeys remote importcontent) $
                 giveup "This remote does not support importing without downloading content."
         -- This map is used to remember content identifiers that
@@ -476,13 +567,13 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
                 case importablecontents of
                         ImportableContentsComplete ic ->
                                 go False largematcher cidmap importing db ic >>= return . \case
-                                       Nothing -> Nothing
-                                       Just v -> Just $ ImportableContentsComplete v
+                                       Nothing -> ImportUnfinished
+                                       Just v -> ImportFinished $ ImportableContentsComplete v
                         ImportableContentsChunked {} -> do
                                 c <- gochunked db (importableContentsChunk importablecontents)
                                 gohistory largematcher cidmap importing db (importableHistoryComplete importablecontents) >>= return . \case
-                                       Nothing -> Nothing
-                                       Just h -> Just $ ImportableContentsChunked
+                                       Nothing -> ImportUnfinished
+                                       Just h -> ImportFinished $ ImportableContentsChunked
                                                 { importableContentsChunk = c
                                                 , importableHistoryComplete = h
                                                 }
diff --git a/Annex/Locations.hs b/Annex/Locations.hs

index c8ddc5cc96f0cf515420ae63f5526dbe8fb946c3..0a0ef438e1d14d767a330634429721d2d76e7abd 100644 (file)
--- a/Annex/Locations.hs
+++ b/Annex/Locations.hs
@@ -1,6 +1,6 @@
  {- git-annex file locations
   -
- - Copyright 2010-2022 Joey Hess <id@joeyh.name>
+ - Copyright 2010-2023 Joey Hess <id@joeyh.name>
   -
   - Licensed under the GNU AGPL version 3 or higher.
   -}
@@ -59,6 +59,8 @@ module Annex.Locations (
         gitAnnexExportLock,
         gitAnnexExportUpdateLock,
         gitAnnexExportExcludeLog,
+       gitAnnexImportDir,
+       gitAnnexImportLog,
         gitAnnexContentIdentifierDbDir,
         gitAnnexContentIdentifierLock,
         gitAnnexScheduleState,
@@ -438,6 +440,16 @@ gitAnnexContentIdentifierDbDir r c =
  gitAnnexContentIdentifierLock :: Git.Repo -> GitConfig -> RawFilePath
  gitAnnexContentIdentifierLock r c = gitAnnexContentIdentifierDbDir r c <> ".lck"
  
+{- .git/annex/import/ is used to store information about
+ - imports from special remotes. -}
+gitAnnexImportDir :: Git.Repo -> GitConfig -> RawFilePath
+gitAnnexImportDir r c = fromMaybe (gitAnnexDir r) (annexDbDir c) P.</> "import"
+
+{- File containing state about the last import done from a remote. -}
+gitAnnexImportLog :: UUID -> Git.Repo -> GitConfig -> RawFilePath
+gitAnnexImportLog u r c = 
+       gitAnnexImportDir r c P.</> fromUUID u P.</> "log"
+
  {- .git/annex/schedulestate is used to store information about when
   - scheduled jobs were last run. -}
  gitAnnexScheduleState :: Git.Repo -> RawFilePath
diff --git a/Command/Import.hs b/Command/Import.hs

index e00af9fdfcfb4c41301781f66cf8593824dc0dbc..a18c22aa3060dfa41a6faa0c0837118b4a743459 100644 (file)
--- a/Command/Import.hs
+++ b/Command/Import.hs
@@ -336,12 +336,12 @@ seekRemote remote branch msubdir importcontent ci = do
         liftIO (atomically (readTVar importabletvar)) >>= \case
                 Nothing -> return ()
                 Just importable -> importKeys remote importtreeconfig importcontent False importable >>= \case
-                       Nothing -> warning $ UnquotedString $ concat
+                       ImportUnfinished -> warning $ UnquotedString $ concat
                                 [ "Failed to import some files from "
                                 , Remote.name remote
                                 , ". Re-run command to resume import."
                                 ]
-                       Just imported -> void $
+                       ImportFinished imported -> void $
                                 includeCommandAction $ 
                                         commitimport imported
    where
diff --git a/Command/Sync.hs b/Command/Sync.hs

index d6815618e56398044ddecb1c03c57cf93ce5d11e..ed93e36131abcb7661be3a3103561c13a7868f9d 100644 (file)
--- a/Command/Sync.hs
+++ b/Command/Sync.hs
@@ -587,12 +587,12 @@ pullThirdPartyPopulated o remote
                 Command.Import.listContents' remote ImportTree (CheckGitIgnore False) go
    where
         go (Just importable) = importKeys remote ImportTree False True importable >>= \case
-               Just importablekeys -> do
+               ImportFinished importablekeys -> do
                         (_imported, updatestate) <- recordImportTree remote ImportTree importablekeys
                         next $ do
                                 updatestate
                                 return True
-               Nothing -> next $ return False
+               ImportUnfinished -> next $ return False
         go Nothing = next $ return True -- unchanged from before
  
         ai = ActionItemOther (Just (UnquotedString (Remote.name remote)))
diff --git a/Git/FilePath.hs b/Git/FilePath.hs

index fde748593008e38f66333a03dc11a98af4abc956..b27c0c70594a73dfcb6141eaefd0d6af6af18f32 100644 (file)
--- a/Git/FilePath.hs
+++ b/Git/FilePath.hs
@@ -46,7 +46,7 @@ instance NFData TopFilePath
  {- A file in a branch or other treeish. -}
  data BranchFilePath = BranchFilePath Ref TopFilePath
         deriving (Show, Eq, Ord)
-
+ 
  {- Git uses the branch:file form to refer to a BranchFilePath -}
  descBranchFilePath :: BranchFilePath -> StringContainingQuotedPath
  descBranchFilePath (BranchFilePath b f) =
diff --git a/Logs/Import.hs b/Logs/Import.hs

new file mode 100644 (file)

index 0000000..7d3e0ea
--- /dev/null
+++ b/Logs/Import.hs
@@ -0,0 +1,37 @@
+{- git-annex import logs
+ -
+ - Copyright 2023 Joey Hess <id@joeyh.name>
+ -
+ - Licensed under the GNU AGPL version 3 or higher.
+ -}
+
+module Logs.Import (
+       recordContentIdentifierTree,
+       getContentIdentifierTree
+) where
+
+import Annex.Common
+import Git.Types
+import Git.Sha
+import Logs.File
+
+import qualified Data.ByteString.Lazy as L
+
+{- Records the sha of a tree that contains hashes of ContentIdentifiers
+ - that were imported from a remote. -}
+recordContentIdentifierTree :: UUID -> Sha -> Annex ()
+recordContentIdentifierTree u t = do
+       l <- calcRepo' (gitAnnexImportLog u)
+       writeLogFile l (fromRef t)
+
+{- Gets the tree last recorded for a remote. -}
+getContentIdentifierTree :: UUID -> Annex (Maybe Sha)
+getContentIdentifierTree u = do
+       l <- calcRepo' (gitAnnexImportLog u)
+       -- This is safe because the log file is written atomically.
+       calcLogFileUnsafe l Nothing update
+  where
+       update l Nothing = extractSha (L.toStrict l)
+       -- Subsequent lines are ignored. This leaves room for future
+       -- expansion of what is logged.
+       update _l (Just l) = Just l
diff --git a/Types/Import.hs b/Types/Import.hs

index db5b25d39f85a4c07cc29307875160b3f7e16e56..9b0fa226d64eb089b9a7630d422cb22153260c1b 100644 (file)
--- a/Types/Import.hs
+++ b/Types/Import.hs
@@ -1,11 +1,11 @@
  {- git-annex import types
   -
- - Copyright 2019-2021 Joey Hess <id@joeyh.name>
+ - Copyright 2019-2023 Joey Hess <id@joeyh.name>
   -
   - Licensed under the GNU AGPL version 3 or higher.
   -}
  
-{-# LANGUAGE DeriveGeneric #-}
+{-# LANGUAGE DeriveGeneric, DeriveFunctor #-}
  
  module Types.Import where
  
@@ -67,7 +67,7 @@ data ImportableContents info = ImportableContents
         -- locations. So, if a remote does not support Key/Value access,
         -- it should not populate the importableHistory.
         }
-       deriving (Show, Generic)
+       deriving (Show, Generic, Functor)
  
  instance NFData info => NFData (ImportableContents info)
  
@@ -81,6 +81,7 @@ data ImportableContentsChunkable m info
                 , importableHistoryComplete :: [ImportableContents info]
                 -- ^ Chunking the history is not supported
                 }
+       deriving (Functor)
  
  {- A chunk of ImportableContents, which is the entire content of a subtree
   - of the main tree. Nested subtrees are not allowed. -}
@@ -92,6 +93,7 @@ data ImportableContentsChunk m info = ImportableContentsChunk
         -- ^ Continuation to get the next chunk.
         -- Returns Nothing when there are no more chunks.
         }
+       deriving (Functor)
  
  newtype ImportChunkSubDir = ImportChunkSubDir { importChunkSubDir :: RawFilePath }
  
diff --git a/git-annex.cabal b/git-annex.cabal

index 22342240fc51e0a842d0e6123cc5c60e7c4b9b76..e70b7587bc76f554ddb41e5cce00979a840b2d3d 100644 (file)
--- a/git-annex.cabal
+++ b/git-annex.cabal
@@ -910,6 +910,7 @@ Executable git-annex
      Logs.File
      Logs.FsckResults
      Logs.Group
+    Logs.Import
      Logs.Line
      Logs.Location
      Logs.MapLog
author	Joey Hess <joeyh@joeyh.name>
	Wed, 31 May 2023 19:45:23 +0000 (15:45 -0400)
committer	Joey Hess <joeyh@joeyh.name>
	Wed, 31 May 2023 20:01:34 +0000 (16:01 -0400)
Annex/Import.hs		patch \| blob \| history
Annex/Locations.hs		patch \| blob \| history
Command/Import.hs		patch \| blob \| history
Command/Sync.hs		patch \| blob \| history
Git/FilePath.hs		patch \| blob \| history
Logs/Import.hs	[new file with mode: 0644]	patch \| blob
Types/Import.hs		patch \| blob \| history
git-annex.cabal		patch \| blob \| history